We will use the simulated data which are provided in the package and can be loaded via:
# library(BDcocolasso)
# During development the package is loaded from source instead of from an
# installed library.
devtools::load_all()
#> Loading BDcocolasso
# profvis is used below to profile the (fairly expensive) model fits.
pacman::p_load(profvis)
# Example data sets shipped with the package: fully corrupted and partially
# (block) corrupted versions of the additive-error and missing-data settings.
data(
  "simulated_data_missing",
  "simulated_data_additive",
  "simulated_data_missing_block",
  "simulated_data_additive_block"
)
These datasets correspond to corrupted datasets in the additive error setting and the missing data setting, and to partially corrupted datasets in the additive error setting and the missing data setting. In the missing data setting, those datasets contain NA values, whereas in the additive error setting they do not contain NA values but the covariates are measured with additive error. Note that, in the additive error setting, it is essential to know the standard deviation of the additive error matrix in order to run the CoCoLasso algorithm.
We can first perform a classic CoCoLasso regression. To do that, it is important to note that the datasets must be converted to a matrix:
# Sample size for the examples: a reduced run (n < p) when the vignette
# parameter NLessThanP is set, otherwise all 500 observations.
n_used <- if (params$NLessThanP) 100 else 500
n_used
#> [1] 500
# Response is the first column of the simulated missing-data set.
y_missing <- simulated_data_missing[seq_len(n_used), 1]
length(y_missing)
#> [1] 500
# The remaining columns are the (partially missing) covariates; coco()
# requires a matrix, so coerce the data frame.
Z_missing <- as.matrix(simulated_data_missing[seq_len(n_used), -1, drop = FALSE])
dim(Z_missing)
#> [1] 500 50
n_missing <- nrow(Z_missing)
p_missing <- ncol(Z_missing)
y_missing <- as.matrix(y_missing)
We can then fit the CoCoLasso model using our data. It is important to specify the type of noise (additive or missing) and the use of CoCoLasso (block=FALSE) or BD-CoCoLasso (block=TRUE). Here, noise is equal to missing because we have NA values in the dataset. We can use two types of penalties: the classic lasso penalty (penalty=“lasso”) or the SCAD penalty (penalty=“SCAD”). The latter should lead to less bias in the solution when the signals are strong.
# Fix the RNG seed so the cross-validation folds are reproducible.
set.seed(123456)
# Profile the CoCoLasso fit: missing-data noise model, lasso penalty,
# K = 4 CV folds over a 100-step lambda grid.
profvis::profvis({
fit_missing.lasso = coco(Z=Z_missing,y=y_missing,n=n_missing,p=p_missing,
step=100,K=4,mu=10,tau=NULL, etol = 1e-4,
noise = "missing",block=FALSE, penalty="lasso")
})
#> [1] "Doing the global data"
#> [1] "Doing the 1 fold"
#> [1] "Doing the 2 fold"
#> [1] "Doing the 3 fold"
#> [1] "Doing the 4 fold"
#> [1] "Early stopping because of error getting too high"
# Same seed as the lasso run so the folds match, isolating the effect of
# the penalty choice.
set.seed(123456)
# Refit with the SCAD penalty, which should reduce bias for strong signals.
profvis::profvis({
fit_missing.SCAD = coco(Z=Z_missing,y=y_missing,n=n_missing,p=p_missing,
step=100,K=4,mu=10,tau=NULL, etol = 1e-4,
noise = "missing",block=FALSE, penalty="SCAD")
})
#> [1] "Doing the global data"
#> [1] "Doing the 1 fold"
#> [1] "Doing the 2 fold"
#> [1] "Doing the 3 fold"
#> [1] "Doing the 4 fold"
#> [1] "Early Stopping because of convergence of the error"
It is possible to print the fitted object, so as to display the evolution of mean-squared error as a function of the lambda values:
# Print the fitted object: shows the call and the cross-validated error
# for every lambda value on the solution path.
print(fit_missing.lasso)
#>
#> Call: coco(Z = Z_missing, y = y_missing, n = n_missing, p = p_missing, step = 100, K = 4, mu = 10, tau = NULL, etol = 1e-04, noise = "missing", block = FALSE, penalty = "lasso")
#>
#> lambda error
#> 1 0.76763194 -0.0165129
#> 2 0.71589612 -0.0773991
#> 3 0.66764711 -0.1522647
#> 4 0.62264993 -0.2182448
#> 5 0.58068541 -0.2901445
#> 6 0.54154916 -0.3513375
#> 7 0.50505056 -0.4045698
#> 8 0.47101184 -0.4508776
#> 9 0.43926722 -0.4911626
#> 10 0.40966208 -0.5264080
#> 11 0.38205223 -0.5607160
#> 12 0.35630318 -0.5950691
#> 13 0.33228954 -0.6297437
#> 14 0.30989434 -0.6598956
#> 15 0.28900850 -0.6861143
#> 16 0.26953029 -0.7089124
#> 17 0.25136485 -0.7287358
#> 18 0.23442370 -0.7459725
#> 19 0.21862433 -0.7609596
#> 20 0.20388978 -0.7739905
#> 21 0.19014829 -0.7853202
#> 22 0.17733293 -0.7951706
#> 23 0.16538129 -0.8037345
#> 24 0.15423514 -0.8111799
#> 25 0.14384021 -0.8176525
#> 26 0.13414586 -0.8232793
#> 27 0.12510488 -0.8281707
#> 28 0.11667323 -0.8324225
#> 29 0.10880984 -0.8357859
#> 30 0.10147642 -0.8386105
#> 31 0.09463725 -0.8410472
#> 32 0.08825902 -0.8431965
#> 33 0.08231066 -0.8450964
#> 34 0.07676319 -0.8465763
#> 35 0.07158961 -0.8480106
#> 36 0.06676471 -0.8493002
#> 37 0.06226499 -0.8504026
#> 38 0.05806854 -0.8509535
#> 39 0.05415492 -0.8513788
#> 40 0.05050506 -0.8518766
#> 41 0.04710118 -0.8521917
#> 42 0.04392672 -0.8524971
#> 43 0.04096621 -0.8527479
#> 44 0.03820522 -0.8527688
#> 45 0.03563032 -0.8525032
#> 46 0.03322895 -0.8520793
#> 47 0.03098943 -0.8514066
#> 48 0.02890085 -0.8508179
#> 49 0.02695303 -0.8499050
#> 50 0.02513649 -0.8481943
#> 51 0.02344237 -0.8465110
#> 52 0.02186243 -0.8449294
#> 53 0.02038898 -0.8429352
#> 54 0.01901483 -0.8407639
It is also possible to display the coefficients obtained for all values of lambda, or for some specific values of lambda.
# Coefficients of the lasso fit at the optimal lambda.
coef(fit_missing.lasso, s=fit_missing.lasso$lambda.opt)
#> Coefficient
#> V2 0.511712000
#> V3 0.399633777
#> V4 0.000000000
#> V5 0.000000000
#> V6 0.291618671
#> V7 0.000000000
#> V8 0.000000000
#> V9 0.011370921
#> V10 0.000000000
#> V11 0.000000000
#> V12 0.000000000
#> V13 0.019742768
#> V14 0.000000000
#> V15 0.000000000
#> V16 0.000000000
#> V17 0.000000000
#> V18 0.000000000
#> V19 0.000000000
#> V20 0.000000000
#> V21 0.000000000
#> V22 0.000000000
#> V23 0.000000000
#> V24 0.000000000
#> V25 -0.009171793
#> V26 0.000000000
#> V27 0.000000000
#> V28 0.000000000
#> V29 0.000000000
#> V30 0.000000000
#> V31 0.004107882
#> V32 0.034193197
#> V33 0.000000000
#> V34 0.000000000
#> V35 0.000000000
#> V36 0.000000000
#> V37 0.000000000
#> V38 0.000000000
#> V39 0.000000000
#> V40 0.000000000
#> V41 0.000000000
#> V42 0.000000000
#> V43 0.000000000
#> V44 0.000000000
#> V45 0.000000000
#> V46 0.000000000
#> V47 0.000000000
#> V48 0.008979222
#> V49 0.000000000
#> V50 0.000000000
#> V51 0.000000000
# Coefficients of the SCAD fit at the optimal lambda (noticeably sparser
# than the lasso solution shown above).
coef(fit_missing.SCAD, s=fit_missing.SCAD$lambda.opt)
#> Coefficient
#> V2 0.5398148
#> V3 0.4201732
#> V4 0.0000000
#> V5 0.0000000
#> V6 0.3071267
#> V7 0.0000000
#> V8 0.0000000
#> V9 0.0000000
#> V10 0.0000000
#> V11 0.0000000
#> V12 0.0000000
#> V13 0.0000000
#> V14 0.0000000
#> V15 0.0000000
#> V16 0.0000000
#> V17 0.0000000
#> V18 0.0000000
#> V19 0.0000000
#> V20 0.0000000
#> V21 0.0000000
#> V22 0.0000000
#> V23 0.0000000
#> V24 0.0000000
#> V25 0.0000000
#> V26 0.0000000
#> V27 0.0000000
#> V28 0.0000000
#> V29 0.0000000
#> V30 0.0000000
#> V31 0.0000000
#> V32 0.0000000
#> V33 0.0000000
#> V34 0.0000000
#> V35 0.0000000
#> V36 0.0000000
#> V37 0.0000000
#> V38 0.0000000
#> V39 0.0000000
#> V40 0.0000000
#> V41 0.0000000
#> V42 0.0000000
#> V43 0.0000000
#> V44 0.0000000
#> V45 0.0000000
#> V46 0.0000000
#> V47 0.0000000
#> V48 0.0000000
#> V49 0.0000000
#> V50 0.0000000
#> V51 0.0000000
It is also possible to obtain a prediction for new covariates. Let’s simulate values following the same simulation pattern used to obtain Z_missing, and look at the obtained prediction. Default configuration is to use coefficients for lambda.sd:
# Simulate one new observation following the design used for Z_missing:
# AR(1) covariance, sparse coefficient vector, Gaussian noise.
# The covariance is named `sigma` (rather than `cov`) so that the base
# stats::cov() function is not masked.
sigma <- cov_autoregressive(p_missing)
X <- MASS::mvrnorm(1, mu = rep(0, p_missing), Sigma = sigma)
beta <- c(3, 2, 0, 0, 1.5, rep(0, p_missing - 5))
y <- X %*% beta + rnorm(1, mean = 0, sd = 2)
y
#> [,1]
#> [1,] 3.354809
Let’s compare the prediction obtained with the lasso penalty and with the SCAD penalty :
# Prediction from the lasso fit; the default uses the coefficients at
# lambda.sd (one-standard-error rule).
y_predict.sd.lasso <- predict(fit_missing.lasso, newx = X, type="response")
y_predict.sd.lasso
#> [,1]
#> [1,] 2.020825
# Prediction from the SCAD fit for the same new observation.
y_predict.sd.SCAD <- predict(fit_missing.SCAD, newx = X, type="response")
y_predict.sd.SCAD
#> [,1]
#> [1,] 2.044519
We can see that in this case using the SCAD penalty leads to less bias.
It is also possible to specify the lambda value with which we wish to perform the prediction:
# Predict with the coefficients at the optimal lambda instead of lambda.sd.
y_predict.opt <- predict(fit_missing.lasso, newx = X, type="response", lambda.pred = fit_missing.lasso$lambda.opt)
y_predict.opt
#> 44
#> [1,] 2.717648
It is then possible to visualize the solution path of the coefficients:
# Solution paths of the coefficients for both penalties.
BDcocolasso::plotCoef(fit_missing.lasso)
BDcocolasso::plotCoef(fit_missing.SCAD)
It is also possible to visualize the mean squared error for all values of lambda:
# Cross-validated error curves over the lambda grid for both penalties.
BDcocolasso::plotError(fit_missing.lasso)
BDcocolasso::plotError(fit_missing.SCAD)
The red stands for the mean squared error (without a constant term, which is why we obtain negative values), and the black depicts the standard deviation for each error value. On each of the plots, the left dashed line represents the optimal lambda, while the right dashed line represents the lambda corresponding to the one-standard-error rule.
# Stop rendering here: everything below (the BD-CoCoLasso examples) is NOT
# executed or rendered, presumably to avoid the high computational cost.
# NOTE(review): remove this line to render the full vignette -- confirm the
# early exit is intentional.
knitr::knit_exit()
We can fit the BDCoCoLasso model for both datasets using :
# Sample size for the block example: reduced (n < p) when the vignette
# parameter NLessThanP is set, otherwise all 500 observations.
n_used <- if (params$NLessThanP) 100 else 500
n_used
#> [1] 500
# The algorithm expects the first p1 columns of Z to be uncorrupted and
# the last p2 columns to be corrupted.
p1 <- 180
p2 <- 20
# Response is the first column; the remaining columns are the covariates,
# coerced to the matrix form required by coco().
y_missing <- simulated_data_missing_block[seq_len(n_used), 1]
Z_missing <- as.matrix(simulated_data_missing_block[seq_len(n_used), -1, drop = FALSE])
dim(Z_missing)
#> [1] 500 200
n_missing <- nrow(Z_missing)
p_missing <- ncol(Z_missing)
y_missing <- as.matrix(y_missing)
# Reproducible CV folds for the BD-CoCoLasso fit.
set.seed(123456)
# Block fit (block = TRUE): the first p1 columns are treated as uncorrupted
# and the last p2 as corrupted, with K = 5 folds over a 100-step grid.
profvis::profvis({
fit_missing = coco(Z=Z_missing,y=y_missing,n=n_missing,p=p_missing,p1=p1,
p2=p2,step=100,K=5,mu=10,tau=NULL,noise="missing",block=TRUE,
penalty="lasso")
})
#> [1] "Processing the global data"
#> [1] "entering for loop over the K folds"
#> [1] "Processing the 1 fold"
#> [1] "Processing the 2 fold"
#> [1] "Processing the 3 fold"
#> [1] "Processing the 4 fold"
#> [1] "Processing the 5 fold"
#> [1] "Lambda index: 1"
#> [1] "Lambda index: 2"
#> [1] "Lambda index: 3"
#> [1] "Lambda index: 4"
#> [1] "Lambda index: 5"
#> [1] "Lambda index: 6"
#> [1] "Lambda index: 7"
#> [1] "Lambda index: 8"
#> [1] "Lambda index: 9"
#> [1] "Lambda index: 10"
#> [1] "Lambda index: 11"
#> [1] "Lambda index: 12"
#> [1] "Lambda index: 13"
#> [1] "Lambda index: 14"
#> [1] "Lambda index: 15"
#> [1] "Lambda index: 16"
#> [1] "Lambda index: 17"
#> [1] "Lambda index: 18"
#> [1] "Lambda index: 19"
#> [1] "Lambda index: 20"
#> [1] "Lambda index: 21"
#> [1] "Lambda index: 22"
#> [1] "Lambda index: 23"
#> [1] "Lambda index: 24"
#> [1] "Lambda index: 25"
#> [1] "Lambda index: 26"
#> [1] "Lambda index: 27"
#> [1] "Lambda index: 28"
#> [1] "Lambda index: 29"
#> [1] "Lambda index: 30"
#> [1] "Lambda index: 31"
#> [1] "Lambda index: 32"
#> [1] "Lambda index: 33"
#> [1] "Lambda index: 34"
#> [1] "Lambda index: 35"
#> [1] "Lambda index: 36"
#> [1] "Lambda index: 37"
#> [1] "Lambda index: 38"
#> [1] "Lambda index: 39"
#> [1] "Lambda index: 40"
#> [1] "Lambda index: 41"
#> [1] "Lambda index: 42"
#> [1] "Lambda index: 43"
#> [1] "Lambda index: 44"
#> [1] "Lambda index: 45"
#> [1] "Lambda index: 46"
#> [1] "Lambda index: 47"
#> [1] "Lambda index: 48"
#> [1] "Lambda index: 49"
#> [1] "Lambda index: 50"
#> [1] "Early stopping because of error getting too high"
# NOTE: seed 12345 does not work -- the fit below hangs at lambda index 53:
# set.seed(12345)
# fit_missing = coco(Z=Z_missing,y=y_missing,n=n_missing,p=p_missing,p1=p1,p2=p2,step=100,K=5,mu=10,tau=NULL,noise="missing",block=TRUE, penalty="lasso")
Note that the algorithm requires that the first p1 columns of Z be the uncorrupted covariates, and that the last p2 columns be the corrupted covariates. It is also important to keep in mind that this algorithm has a relatively high computational cost. In the example given above, it is normal for the code to run for a couple of minutes before a result is obtained. When the number of features reaches one thousand, the memory demand becomes high.
We can plot the coefficients and the error for the missing data scenario. Here we should expect to have 6 non-zero coefficients, with pairs of coefficients having similar values, since data was simulated with beta = c(3,2,0,0,1.5,0,…,0,1.5,0,0,2,3).
# Solution path and cross-validated error curve for the block fit.
BDcocolasso::plotCoef(fit_missing)
BDcocolasso::plotError(fit_missing)
We can also use the SCAD penalty for the Block-Descent CoCoLasso:
# Same seed as the block lasso run so the folds match.
set.seed(123456)
# Block fit with the SCAD penalty (same fold count and lambda grid as the
# lasso run above).
profvis::profvis({
fit_missing.SCAD = coco(Z=Z_missing,y=y_missing,n=n_missing,p=p_missing,p1=p1,
p2=p2,step=100,K=5,mu=10,tau=NULL,noise="missing",
block=TRUE, penalty="SCAD")
})
#> [1] "Processing the global data"
#> [1] "entering for loop over the K folds"
#> [1] "Processing the 1 fold"
#> [1] "Processing the 2 fold"
#> [1] "Processing the 3 fold"
#> [1] "Processing the 4 fold"
#> [1] "Processing the 5 fold"
#> [1] "Lambda index: 1"
#> [1] "Lambda index: 2"
#> [1] "Lambda index: 3"
#> [1] "Lambda index: 4"
#> [1] "Lambda index: 5"
#> [1] "Lambda index: 6"
#> [1] "Lambda index: 7"
#> [1] "Lambda index: 8"
#> [1] "Lambda index: 9"
#> [1] "Lambda index: 10"
#> [1] "Lambda index: 11"
#> [1] "Lambda index: 12"
#> [1] "Lambda index: 13"
#> [1] "Lambda index: 14"
#> [1] "Lambda index: 15"
#> [1] "Lambda index: 16"
#> [1] "Lambda index: 17"
#> [1] "Lambda index: 18"
#> [1] "Lambda index: 19"
#> [1] "Lambda index: 20"
#> [1] "Lambda index: 21"
#> [1] "Lambda index: 22"
#> [1] "Lambda index: 23"
#> [1] "Lambda index: 24"
#> [1] "Lambda index: 25"
#> [1] "Lambda index: 26"
#> [1] "Lambda index: 27"
#> [1] "Lambda index: 28"
#> [1] "Lambda index: 29"
#> [1] "Lambda index: 30"
#> [1] "Lambda index: 31"
#> [1] "Lambda index: 32"
#> [1] "Lambda index: 33"
#> [1] "Lambda index: 34"
#> [1] "Lambda index: 35"
#> [1] "Lambda index: 36"
#> [1] "Lambda index: 37"
#> [1] "Lambda index: 38"
#> [1] "Lambda index: 39"
#> [1] "Lambda index: 40"
#> [1] "Lambda index: 41"
#> [1] "Lambda index: 42"
#> [1] "Lambda index: 43"
#> [1] "Lambda index: 44"
#> [1] "Lambda index: 45"
#> [1] "Lambda index: 46"
#> [1] "Lambda index: 47"
#> [1] "Early stopping because of error getting too high"